While we have seen success predicting which homework a notebook belongs to, we would like to know whether we can separate the notebooks in an unsupervised way. We will try a few clustering techniques, using dimensionality reduction to visualize the results and silhouette score to measure cluster fit.
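As a quick reference, here is a minimal toy sketch (illustrative only, not part of the analysis below) of how silhouette score behaves: values near 1 indicate tight, well-separated clusters, values near 0 indicate overlapping clusters, and negative values suggest points assigned to the wrong cluster.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Two well-separated Gaussian blobs: k-means recovers them, so the
# silhouette score comes out close to 1. Overlapping blobs would
# push the score toward 0.
rng = np.random.RandomState(0)
toy = np.vstack([rng.randn(50, 2), rng.randn(50, 2) + 10])
labels = KMeans(n_clusters=2, random_state=0).fit_predict(toy)
print(silhouette_score(toy, labels))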
In [1]:
import sys
home_directory = '/dfs/scratch2/fcipollone'
sys.path.append(home_directory)
import numpy as np
from nbminer.notebook_miner import NotebookMiner
hw_filenames = np.load('../homework_names_jplag_combined_per_student.npy')
hw_notebooks = [[NotebookMiner(filename) for filename in temp[:59]] for temp in hw_filenames]
In [2]:
from nbminer.pipeline.pipeline import Pipeline
from nbminer.features.features import Features
from nbminer.preprocess.get_ast_features import GetASTFeatures
from nbminer.preprocess.get_imports import GetImports
from nbminer.preprocess.resample_by_node import ResampleByNode
from nbminer.encoders.ast_graph.ast_graph import ASTGraphReducer
from nbminer.preprocess.feature_encoding import FeatureEncoding
from nbminer.preprocess.featurize_functions import FeaturizeFunctions
from nbminer.encoders.cluster.kmeans_encoder import KmeansEncoder
from nbminer.results.similarity.jaccard_similarity import NotebookJaccardSimilarity
from nbminer.results.prediction.corpus_identifier import CorpusIdentifier
#a = Features(hw_notebooks[0], 'hw0')
#a.add_notebooks(hw_notebooks[1], 'hw1')
a = Features(hw_notebooks[2], 'hw2')
a.add_notebooks(hw_notebooks[3], 'hw3')
a.add_notebooks(hw_notebooks[4], 'hw4')
a.add_notebooks(hw_notebooks[5], 'hw5')
# Build the preprocessing pipeline: extract AST features, resample by node,
# extract imports, featurize function calls, and collect per-notebook corpora.
gastf = GetASTFeatures()
rbn = ResampleByNode()
gi = GetImports()
ff = FeaturizeFunctions()
ci = CorpusIdentifier(feature_name='short_function_list')
pipe = Pipeline([gastf, rbn, gi, ff, ci])
a = pipe.transform(a)
In [4]:
import sklearn.feature_extraction.text
X, y = ci.get_data_set()
countvec = sklearn.feature_extraction.text.CountVectorizer()
X_list = [" ".join(el) for el in X]
countvec.fit(X_list)
X = countvec.transform(X_list)
In [5]:
import sklearn.cluster
import sklearn.metrics

clusterer = sklearn.cluster.KMeans(n_clusters=4).fit(X)
cluster_score = sklearn.metrics.silhouette_score(X, clusterer.labels_)
cheat_score = sklearn.metrics.silhouette_score(X, y)
print('Silhouette score using the actual labels:', cheat_score)
print('Silhouette score using the cluster labels:', cluster_score)
In [10]:
import sklearn.decomposition
x_reduced = sklearn.decomposition.PCA(n_components=2).fit_transform(X.todense())
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 5, 10
fig, axes = plt.subplots(2)
axes[0].scatter(x_reduced[:,0], x_reduced[:,1], c=y)
axes[0].set_title('PCA-reduced notebooks with original labels')
axes[0].set_xlim(-10,20)
axes[0].set_ylim(-10,15)
axes[1].scatter(x_reduced[:,0], x_reduced[:,1], c=clusterer.labels_)
axes[1].set_title('PCA-reduced notebooks with k-means cluster labels')
axes[1].set_xlim(-10,20)
axes[1].set_ylim(-10,15)
Out[10]: [figure: PCA-reduced notebooks, colored by original homework labels (top) and by k-means cluster labels (bottom)]
In [11]:
# Switch to TF-IDF weighting so we can look for characteristic features
X, y = ci.get_data_set()
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
X_list = [" ".join(el) for el in X]
tfidf.fit(X_list)
X = tfidf.transform(X_list)
#X = X.todense()
In [12]:
# Top 20 features by mean TF-IDF weight across all notebooks
feature_array = np.array(tfidf.get_feature_names())
mean_tfidf = np.asarray(X.mean(axis=0)).flatten()
top_n = feature_array[np.argsort(mean_tfidf)[::-1]][:20]
print(top_n)
In [14]:
# Re-featurize, keeping only tokens that appear in the top TF-IDF feature list
X, y = ci.get_data_set()
countvec = sklearn.feature_extraction.text.CountVectorizer()
X_list = [" ".join([val for val in el if val in top_n]) for el in X]
countvec.fit(X_list)
X = countvec.transform(X_list)
X = X.todense()
In [15]:
x_reduced = sklearn.decomposition.PCA(n_components=2).fit_transform(X)
print(x_reduced.shape)
plt.rcParams['figure.figsize'] = 5, 5
plt.scatter(x_reduced[:,0], x_reduced[:,1], c=y)
Out[15]: [figure: PCA scatter of notebooks restricted to the top TF-IDF features, colored by homework label]
In [16]:
import sklearn.ensemble
import sklearn.feature_extraction.text
from sklearn.model_selection import cross_val_score

X, y = ci.get_data_set()
countvec = sklearn.feature_extraction.text.CountVectorizer()
X_list = [" ".join(el) for el in X]
countvec.fit(X_list)
X = countvec.transform(X_list)

# Shuffle the rows so the cross-validation folds mix all homeworks
X = X.todense()
p = np.random.permutation(len(X))
X = X[p]
y = np.array(y)[p]

clf = sklearn.ensemble.RandomForestClassifier(n_estimators=400, max_depth=3)
scores = cross_val_score(clf, X, y, cv=10)
print(scores)
print(np.mean(scores))

# Rank features by importance from a forest fit on the full data set
clf.fit(X, y)
fnames = countvec.get_feature_names()
clfi = clf.feature_importances_
sra = sorted(zip(clfi, fnames), reverse=True)
In [20]:
# Keep the 13 most important features according to the random forest
top_n = [el[1] for el in sra[:13]]
X, y = ci.get_data_set()
countvec = sklearn.feature_extraction.text.CountVectorizer()
X_list = [" ".join([val for val in el if val in top_n]) for el in X]
countvec.fit(X_list)
X = countvec.transform(X_list)
X = X.todense()
In [21]:
x_reduced = sklearn.decomposition.PCA(n_components=2).fit_transform(X)
print(x_reduced.shape)
plt.rcParams['figure.figsize'] = 5, 5
plt.scatter(x_reduced[:,0], x_reduced[:,1], c=y)
Out[21]: [figure: PCA scatter of notebooks restricted to the top random-forest features, colored by homework label]
OK, that actually isn't so bad, but we 'cheated' by using the most predictive features. Is there a way to identify these features a priori? Below we try a few more things.
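One label-free idea, sketched here first, is to assume the discriminative function features are also the high-variance ones and filter with a simple variance threshold; this sketch is not part of the original analysis, and the 0.5 cutoff is an arbitrary illustration, not a tuned value.
from sklearn.feature_selection import VarianceThreshold
import sklearn.feature_extraction.text

# Re-featurize with the full vocabulary, then drop low-variance columns.
# Unlike the random-forest importances above, this never looks at y.
X_all, _ = ci.get_data_set()
counts = sklearn.feature_extraction.text.CountVectorizer().fit_transform(
    [" ".join(el) for el in X_all])
X_highvar = VarianceThreshold(threshold=0.5).fit_transform(counts)
print(X_highvar.shape)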
In [22]:
from sklearn.cluster import FeatureAgglomeration
X, y = ci.get_data_set()
countvec = sklearn.feature_extraction.text.CountVectorizer()
X_list = [" ".join(el) for el in X]
countvec.fit(X_list)
X = countvec.transform(X_list)
x_reduced = FeatureAgglomeration(n_clusters=2).fit_transform(X.todense())
plt.scatter(x_reduced[:,0], x_reduced[:,1], c=y)
Out[22]: [figure: FeatureAgglomeration-reduced notebooks, colored by homework label]
In [23]:
X, y = ci.get_data_set()
tfidf = sklearn.feature_extraction.text.TfidfVectorizer()
X_list = [" ".join(el) for el in X]
tfidf.fit(X_list)
X = tfidf.transform(X_list)
#X = X.todense()
In [24]:
# Reducing to ~50 dimensions with PCA first is a recommended step before t-SNE
x_reduced = sklearn.decomposition.PCA(n_components=50).fit_transform(X.todense())
In [25]:
from sklearn.manifold import TSNE
tsn = TSNE(n_components=2)
x_red = tsn.fit_transform(x_reduced)
plt.scatter(x_red[:,0], x_red[:,1], c=y)
Out[25]: [figure: t-SNE embedding of the notebooks, colored by homework label]
In [31]:
clusterer = sklearn.cluster.KMeans(n_clusters=4).fit(x_red)
cluster_score = sklearn.metrics.silhouette_score(x_red, clusterer.labels_)
cheat_score = sklearn.metrics.silhouette_score(x_red, y)
print('Silhouette score using the cluster labels:', cluster_score)
print('Silhouette score using the actual labels:', cheat_score)
In [32]:
plt.scatter(x_red[:,0], x_red[:,1], c=clusterer.labels_)
Out[32]: [figure: t-SNE embedding of the notebooks, colored by k-means cluster labels]